#!pip install "plotly == 4.14.3"
#!pip install "matplotlib == 3.3.4"
#!pip install -U pipenv
#!pip install "flair == 0.9"
#!pip install "gensim == 3.8.3"
#!pip install "scikit-learn == 0.24.1"
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from flair.models import TextClassifier
from flair.data import Sentence
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsTransformer
import re
# Read the beer-review export (the CSV is opened with Latin-1 encoding).
df = pd.read_csv("Data/BeerDataScienceProject.csv", encoding='latin-1')
# review_time is parsed as a count of seconds (unit="s") into real timestamps.
df["review_time"] = pd.to_datetime(df["review_time"], unit="s")
# Per column: how many values are missing (True) vs. present (False).
df.isna().apply(pd.Series.value_counts).T
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how="any")
Different ways to answer this question:
# Ranking 1: top 3 breweries by the strongest single beer they produce
# (per-brewery maximum of beer_ABV).
top3_abv = df.groupby("beer_brewerId").max().sort_values("beer_ABV", ascending=False).head(3)
# Ranking 2: top 3 breweries by the average ABV across their reviewed beers.
# numeric_only=True keeps text columns (names, review text, ...) out of the
# mean; newer pandas versions raise a TypeError on them instead of dropping them.
top3_abv_avg = (
    df.groupby("beer_brewerId")
    .mean(numeric_only=True)
    .sort_values("beer_ABV", ascending=False)
    .head(3)
)
# Bar chart of the three breweries with the highest average ABV.
# NOTE(review): top3_abv (the max-ABV ranking) is computed earlier but never
# plotted, and this chart is built a second time right below — confirm whether
# one of the two charts was meant to show top3_abv instead.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list(top3_abv_avg.index.astype(str)),  # brewery ids as category labels
    y=top3_abv_avg["beer_ABV"].values.tolist(),
    marker_color="#96D294",
))
fig.update_layout(
    title = "Top 3 Breweries which Produce Strongest Beers on Average",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
# Reference line: the mean of the per-brewery average ABV. Only the beer_ABV
# column is aggregated, so text columns never take part in the mean.
# The figure has no subplot grid, so no row/col is passed to add_hline.
fig.add_hline(
    y=df.groupby("beer_brewerId")["beer_ABV"].mean().mean(),
    line_width=3,
    line_dash="dash",
    line_color="salmon",
    annotation_text="Average",
    annotation_position="bottom right",
)
fig.update_xaxes(title="Brewery ID")
fig.update_yaxes(title="Beer ABV")
# Bar chart of the three breweries with the highest average ABV.
# NOTE(review): this rebuilds the same chart as the block directly above from
# the same data — confirm whether it was meant to plot top3_abv instead.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list(top3_abv_avg.index.astype(str)),  # brewery ids as category labels
    y=top3_abv_avg["beer_ABV"].values.tolist(),
    marker_color="#96D294",
))
fig.update_layout(
    title = "Top 3 Breweries which Produce Strongest Beers on Average",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
# Reference line: the mean of the per-brewery average ABV. Only the beer_ABV
# column is aggregated, so text columns never take part in the mean.
# The figure has no subplot grid, so no row/col is passed to add_hline.
fig.add_hline(
    y=df.groupby("beer_brewerId")["beer_ABV"].mean().mean(),
    line_width=3,
    line_dash="dash",
    line_color="salmon",
    annotation_text="Average",
    annotation_position="bottom right",
)
fig.update_xaxes(title="Brewery ID")
fig.update_yaxes(title="Beer ABV")
Compute the average overall rating per year and plot it.
# Index by review time, sort chronologically and take the yearly mean of every
# numeric column. Text columns are removed first because averaging them is
# meaningless and newer pandas versions raise a TypeError on them.
yearly_df = (
    df.set_index("review_time")
    .sort_index()
    .select_dtypes("number")
    .resample("y")
    .mean()
)
fig = go.Figure()
fig.add_trace(go.Scatter(x=yearly_df.index, y=yearly_df["review_overall"], marker_color="#96D294"))
# Call-out for one data point.
# NOTE(review): position and label are hard-coded rather than read from
# yearly_df — confirm they still match the data if the dataset changes.
fig.add_annotation(
    x="2000-12-31",
    y=4.233333,
    text="2000 - 4.23",
    showarrow=True,
    arrowhead=1,
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#ffffff",
    ),
    align="center",
    bgcolor="#ff7f0e",
    bordercolor="#c7c7c7",
    arrowwidth=2,
)
fig.update_layout(
    title = "Average Overall Beer Ratings Between 1998 and 2012",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Year")
fig.update_yaxes(title="Average Overall Rating")
# Pairwise correlations between the numeric review columns. Non-numeric
# "review" columns (text, profile name, timestamp) are excluded explicitly so
# corr() never sees them; newer pandas versions raise on them.
review_columns = [name for name in df.columns if "review" in name]
review_correlations = df[review_columns].select_dtypes("number").corr().round(2)
# Replace the raw column names with display labels.
# NOTE(review): assumes exactly five numeric review columns in this order — TODO confirm.
review_correlations.columns = ["Appearance", "Palette", "Overall", "Taste", "Aroma"]
review_correlations.index = ["Appearance", "Palette", "Overall", "Taste", "Aroma"]
# Create the figure and draw the matrix as an image.
fig, ax = plt.subplots(figsize=(10,10))
im = ax.imshow(review_correlations)
# One tick per matrix column / row.
ax.set_xticks(np.arange(review_correlations.shape[1]))
ax.set_yticks(np.arange(review_correlations.shape[0]))
# Label the ticks with the feature names.
ax.set_xticklabels(review_correlations.columns)
ax.set_yticklabels(review_correlations.index)
plt.yticks(fontsize=18,)
plt.xticks(fontsize=18,)
# Rotate the x tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Write each correlation value into its cell (x is the column, y is the row).
for (row, col), value in np.ndenumerate(review_correlations.values):
    ax.text(col, row, value, ha="center", va="center", color="w")
ax.set_title("Feature Correlations", fontsize=22)
fig.tight_layout()
plt.show()
# Correlation of every other review factor with the overall score, strongest first.
overall_column = review_correlations["Overall"]
overall_review = overall_column.drop("Overall").sort_values(ascending=False)
fig = go.Figure(
    data=[go.Bar(x=overall_review.index, y=overall_review, marker_color="salmon")]
)
fig.update_layout(
    title="Correlation Between Overall Review and Other Factors",
    yaxis={"zeroline": False, "gridcolor": 'white'},
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font={"size": 14},
)
fig.update_xaxes(title="Factors")
fig.update_yaxes(title="Correlation")
def mutate(x, spread=0.1):
    """Return *x* shifted by a random offset drawn uniformly from [-spread, spread).

    Used to jitter discrete ratings so overlapping points separate in a
    scatter plot.

    Args:
        x: Numeric value to perturb.
        spread: Half-width of the uniform jitter interval (default 0.1, the
            value that was previously hard-coded).

    Returns:
        ``x`` plus one draw from ``np.random.uniform(-spread, spread)``.
    """
    return x + np.random.uniform(-spread, spread)
# Work on a random subset of 5000 reviews.
sample_df = df.sample(5000)
# Jitter both rating columns (aroma first, then overall) with mutate().
for jittered_column in ("review_aroma", "review_overall"):
    sample_df[jittered_column] = sample_df[jittered_column].apply(mutate)
fig = go.Figure(
    data=[go.Scatter(x=sample_df["review_aroma"], y=sample_df["review_overall"], mode="markers")]
)
fig.update_layout(
    title="Overall Rating Compared to Aroma Rating",
    yaxis={"zeroline": False, "gridcolor": 'white'},
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font={"size": 14},
)
fig.update_xaxes(title="Aroma Rating")
fig.update_yaxes(title="Overall Rating")
To solve this, we pick the beers with the best overall reviews whose ABV is closest to the average ABV (we don't want the recommendation to be too strong).
# Absolute distance of every review's beer ABV from the dataset-wide mean ABV.
# (No sort here: column assignment aligns on the index, so sorting the
# right-hand side would not change the stored values.)
df["abv_avg_distance"] = (df["beer_ABV"] - df["beer_ABV"].mean()).abs()
# Best overall rating first; ties broken by the ABV closest to the average.
# Exactly three rows are kept, matching the variable name.
# NOTE(review): rows are individual reviews, so the same beer can appear more
# than once — confirm whether de-duplication per beer is wanted.
recommended_3 = df.sort_values(
    ["review_overall", "abv_avg_distance"], ascending=[False, True]
).head(3)
To solve this question, we follow two approaches:
#Approach 1: rank beer styles by their average numeric overall rating.
# numeric_only=True keeps text columns out of the mean; newer pandas versions
# raise a TypeError on them instead of silently dropping them.
beer_style_df = df.groupby("beer_style").mean(numeric_only=True)
top_10_beer_style = beer_style_df["review_overall"].sort_values(ascending=False).head(10)
fig = go.Figure()
fig.add_trace(go.Bar(x=top_10_beer_style.index, y=top_10_beer_style, marker_color="salmon"))
fig.update_layout(
    title = "top 10 Beer Styles by Average Overall Ratings",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Style")
fig.update_yaxes(title="Overall Rating")
#Approach 2: score the written reviews with Flair's pre-trained sentiment model.
classifier = TextClassifier.load('en-sentiment')
# Work on a random subset of 5000 reviews to keep the run time manageable.
sample_df = df.sample(5000)
# One signed sentiment score per sampled review, in row order.
review_sentiment = []
for review in sample_df["review_text"]:
    # Wrap the raw text and let the classifier attach its label to it.
    sentence = Sentence(review)
    classifier.predict(sentence)
    # The first label carries the predicted class and its score; the score is
    # stored as negative for NEGATIVE predictions and positive otherwise.
    top_label = sentence.labels[0]
    sentiment = top_label.score
    if top_label.value == "NEGATIVE":
        sentiment = -sentiment
    review_sentiment.append(sentiment)
# Attach the scores to the sampled rows.
sample_df["review_sentiment"] = review_sentiment
#Approach 2 (continued): rank beer styles by the average sentiment of their written reviews.
# numeric_only=True keeps text columns out of the mean; newer pandas versions
# raise a TypeError on them instead of silently dropping them.
beer_sentiment_df = sample_df.groupby("beer_style").mean(numeric_only=True)
top_10_beer_sentiment = beer_sentiment_df["review_sentiment"].sort_values(ascending=False).head(10)
fig = go.Figure()
fig.add_trace(go.Bar(x=top_10_beer_sentiment.index, y=top_10_beer_sentiment, marker_color="salmon"))
fig.update_layout(
    title = "top 10 Beer Styles by Average Written Rating Sentiment",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Style")
# The bars show the mean signed sentiment score, not the numeric overall rating.
fig.update_yaxes(title="Average Sentiment")
# Label each review from its overall score: above 2.5 is "positive", anything else "negative".
df["sentiment"] = df["review_overall"].map(
    lambda rating: "positive" if rating > 2.5 else "negative"
)
# Balanced working set: 500 random positive reviews followed by 500 random negative ones.
positive_reviews = df[df["sentiment"] == "positive"].sample(500)
negative_reviews = df[df["sentiment"] == "negative"].sample(500)
sample_df = pd.concat([positive_reviews, negative_reviews])
sample_df.shape
# Load the word2vec vectors from the binary GoogleNews file.
model = KeyedVectors.load_word2vec_format('Models/GoogleNews-vectors-negative300.bin', binary=True)
def get_text_embeddings(text, max_words=121, embedding_dim=300):
    """Turn a review text into one fixed-length, flattened embedding vector.

    The text is cleaned (non-breaking spaces and newlines become spaces, the
    characters ``.,()!@#$?`` are removed, stop words are stripped), split on
    whitespace, and the first ``max_words`` tokens are looked up in the
    module-level word2vec ``model``. Tokens missing from the model, and any
    remaining slots when the text is shorter than ``max_words``, are filled
    with a zero vector.

    Args:
        text: Review text (``str``).
        max_words: Number of word slots in the output. The default of 121 is
            the length the original ``<= 120`` bounds produced.
        embedding_dim: Length of the zero vector used for padding and for
            out-of-vocabulary words; it has to match the model's vector size.

    Returns:
        1-D ``numpy`` array of length ``max_words * embedding_dim``.
    """
    padding_vector = np.zeros(embedding_dim)
    cleaned = text.replace("\xa0", " ").replace("\n", " ")
    cleaned = re.sub(r'[.,()!@#$?]', '', cleaned)
    cleaned = remove_stopwords(cleaned)
    vectors = []
    # Only the first max_words tokens can contribute, so stop there instead of
    # walking the rest of a long review.
    for word in cleaned.split()[:max_words]:
        try:
            vectors.append(model[word])
        except KeyError:
            # Word is not in the embedding vocabulary.
            vectors.append(padding_vector)
    # Pad short texts so every review yields the same number of features.
    vectors.extend([padding_vector] * (max_words - len(vectors)))
    return np.array(vectors).flatten()
# Embed every sampled review text.
sample_df["review_text_embeddings"] = sample_df["review_text"].apply(get_text_embeddings)
# Feature matrix and labels for the whole sample.
sentiment_features = np.array(sample_df["review_text_embeddings"].values.tolist())
sentiment_targets = sample_df["sentiment"]
# Split whole rows (not just the features) so the raw review text of the test
# rows stays available alongside their embeddings.
x_train, x_test, y_train, y_test = train_test_split(sample_df, sentiment_targets)
# Fit an SVM with default settings on the training embeddings.
train_matrix = np.array(x_train["review_text_embeddings"].values.tolist())
sentiment_model = SVC().fit(train_matrix, y_train)
# Score the custom model on the held-out rows.
test_matrix = np.array(x_test["review_text_embeddings"].values.tolist())
predictions = sentiment_model.predict(test_matrix)
accuracy_score(y_test, predictions)
# Classify the same held-out reviews with Flair so both models are scored on
# identical data.
flair_predictions = []
for text in x_test["review_text"]:
    # Build the sentence from the current test review. Using the leftover
    # `review` variable from an earlier loop would classify one unrelated
    # review over and over.
    sentence = Sentence(text)
    # Predict the sentiment of the review.
    classifier.predict(sentence)
    # Map Flair's label onto the lowercase labels used in y_test.
    if sentence.labels[0].value == "NEGATIVE":
        flair_predictions.append("negative")
    else:
        flair_predictions.append("positive")
accuracy_score(y_test, flair_predictions)
# Side-by-side accuracy of the custom SVM and Flair on the held-out reviews.
custom_accuracy = accuracy_score(y_test, predictions)
flair_accuracy = accuracy_score(y_test, flair_predictions)
fig = go.Figure(
    data=[go.Bar(x=["Custom", "Flair"], y=[custom_accuracy, flair_accuracy], marker_color="#96D294")]
)
fig.update_layout(
    title="Custom Sentiment Analysis vs Flair",
    yaxis={"zeroline": False, "gridcolor": 'white'},
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font={"size": 14},
)
fig.update_xaxes(title="Model")
fig.update_yaxes(title="Accuracy", tickformat="%")
To solve this question we need to compare the overall rating and the given sentiment of the text. In order to do this we:
def normalize(x, midpoint=2.5):
    """Rescale *x* so that ``midpoint`` maps to 0 and ``2 * midpoint`` maps to 1.

    With the default midpoint of 2.5 a value of 0 becomes -1, 2.5 becomes 0
    and 5 becomes 1.

    Args:
        x: Numeric value (or array-like supporting arithmetic) to rescale.
        midpoint: Value that should map to 0; must be non-zero.

    Returns:
        ``(x - midpoint) / midpoint``.
    """
    return (x - midpoint) / midpoint
# Rescale every overall rating with normalize() and plot the distribution.
df["review_overall_normalized"] = df["review_overall"].apply(normalize)
fig = go.Figure(
    data=[go.Histogram(x=df["review_overall_normalized"], marker_color="salmon")]
)
fig.update_layout(
    title="Normalized Overall Review Distribution",
    yaxis={"zeroline": False, "gridcolor": 'white'},
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font={"size": 14},
)
fig.update_xaxes(title="Rating")
fig.update_yaxes(title="Number of Reviews")
# Distribution of the signed sentiment scores collected in review_sentiment.
fig = go.Figure(data=[go.Histogram(x=review_sentiment, marker_color="salmon")])
fig.update_layout(
    title="Sentiment Distribution",
    yaxis={"zeroline": False, "gridcolor": 'white'},
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font={"size": 14},
)
fig.update_xaxes(title="Sentiment")
fig.update_yaxes(title="Number of Reviews")
# Accuracy of the custom SVM next to Flair on the held-out reviews.
model_names = ["Custom", "Flair"]
model_accuracies = [
    accuracy_score(y_test, predictions),
    accuracy_score(y_test, flair_predictions),
]
fig = go.Figure(data=[go.Bar(x=model_names, y=model_accuracies, marker_color="#96D294")])
fig.update_layout(
    title="Custom Sentiment Analysis vs Flair",
    yaxis={"zeroline": False, "gridcolor": 'white'},
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font={"size": 14},
)
fig.update_xaxes(title="Model")
fig.update_yaxes(title="Accuracy", tickformat="%")
In order to find similar drinkers, we compare the content of the reviews written by different users. We reuse the review-text embeddings from before and fit a k-nearest-neighbours index on them to get the 5 reviewers whose reviews are most similar.
# Find the reviewers whose review text is closest to a given user's review.
target_profile = "Brad007"
n_similar = 5
embeddings = np.array(sample_df["review_text_embeddings"].values.tolist())
knn = KNeighborsTransformer().fit(embeddings)
# Row positions of the target user's reviews inside the sample. sample_df is a
# random sample, so the user may not be in it at all.
target_positions = np.flatnonzero((sample_df["review_profileName"] == target_profile).values)
if target_positions.size == 0:
    raise ValueError(f"No review by {target_profile!r} in sample_df; cannot look up neighbours")
query_position = target_positions[0]
# Ask for one extra neighbour: the query row is part of the fitted data, so it
# is normally returned as its own closest match.
neighbors = knn.kneighbors(
    [embeddings[query_position]], n_neighbors=n_similar + 1, return_distance=False
)
# Drop the query row by position (not by assuming it is the first hit, which
# can fail when several reviews share the same embedding) and keep n_similar.
similar_positions = [index for index in neighbors[0] if index != query_position][:n_similar]
for index in similar_positions:
    print(sample_df.iloc[index]["review_profileName"])